************************************************
************************************************
* The relationship between NM and Real Accidents
************************************************
************************************************



clear 
clear matrix
set memory 1000m
set more off
cap log close

cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results/072020/calibration"


log using log_file/calibration_072020.log, replace



*************************
** clean drive and quotes
*************************

use working_data/merge_quote_drive_081619.dta, clear

order user_id vin
gsort user_id -vin

bysort user_id: replace vin=vin[_n-1] if vin=="" & vin[_n-1]~=""

bysort vin user_id: gen temp=_n
gen temp1=1 if temp==1
bysort vin: egen temp2=total(temp1)

count if temp2>1 // meaning that one vin may corresponds to multiple user_id
tab temp2

bysort vin user_id: gen temp3=_N
bysort vin: egen temp4=max(temp3)

keep if temp3==temp4 
drop temp temp1 temp2

bysort vin user_id: gen temp=_n
gen temp1=1 if temp==1
bysort vin: egen temp2=total(temp1)

count if temp2>1 // meaning that one vin may corresponds to multiple user_id
tab temp2

list user_id vin temp3 temp4 if temp2>1
sort vin temp

bysort vin: gen temp5=_n if temp==1
bysort vin user_id: egen temp6=total(temp5)
drop if temp6>1

drop temp temp1 temp2 temp3 temp4 temp5 temp6



** check if for each vin, there are multiple dates
bysort vin calendar_day: gen temp=_N

count if temp>1
drop temp

count if user_id==.

***************
** merge claims
***************


sort user_id calendar_day

order user_id vin calendar_day 


merge m:1 vin calendar_day using working_data/claim_clean_further_07032019.dta
drop _merge


count if user_id==.

sort vin user_id
replace user_id=user_id[_n-1] if user_id==. & user_id[_n-1]~=. & vin==vin[_n-1]

list user_id vin if user_id==.

sort vin calendar_day

gen claim_ind=1 if claim_date~=""

sort vin calendar_day

order claim_ind, before(trip_start_date)

****************
** merge repairs
****************

merge m:1 vin calendar_day using working_data/repair_clean_further_070319.dta
drop _merge

count if user_id==.
sort vin user_id
replace user_id=user_id[_n-1] if user_id==. & user_id[_n-1]~=. & vin==vin[_n-1]

gen repair_ind=1 if repair_date~=""
order repair_ind, before(trip_start_date)

order quote_created_date claim_date repair_date, after(trip_start_date)



** some summary stats

sort vin calendar_day
bysort vin: gen day_index=_n
bysort vin: egen total_claims=total(claim_ind)
bysort vin: egen total_repairs=total(repair_ind)

count if day_index==1
count if day_index==1 & total_claims>0  // 906 agents have claims record
count if day_index==1 & total_repairs>0 // 1753 agents have repair record
count if drive_ind==1 & claim_ind==1	// only 137 claims happen on driving days
count if drive_ind==1 & repair_ind==1	// 842 repair records happen on driving days
count if drive_ind==1 & repair_ind==1 & (count_repair>0 | count_accident>0) // 447 repair related to accidents happen on driving days

sum total_claims if  day_index==1 & total_claims>0 // number of claims per person: 2.204194

gen calendar_day_num=date(calendar_day, "YMD")




********************************************
** Relationship between Claims and Near-Miss
********************************************


** for this purpose, drop quotes 
drop if drive_ind==0


** generate accident using claims data
gen have_accident=1 if total_claims>0 & claim_ind==1
replace have_accident=0 if total_claims>0 & claim_ind==.
// people who do not have claim records at all will have a missing value generated

forvalues i=1(1)10{
	forvalues j=1(1)`i'{
		
		display (`j')
		gen temp`j'=1 if drive_ind==1 & claim_ind[_n+`j']==1 & vin==vin[_n+`j'] & calendar_day_num[_n+`j']<=calendar_day_num+`i'
				
	}
	egen temp_total`i'=rowtotal(have_accident-temp`i')
	replace temp_total`i'=1 if temp_total`i'>1
	replace temp_total`i'=. if total_claims==0
	drop temp1-temp`i'
}




** generate accident using repair data
gen have_accident_repair=1 if total_repairs>0 & repair_ind==1 & (count_repair>0 | count_accident>0) & count_repair~=. & count_accident~=.
replace have_accident_repair=0 if total_repairs>0 & repair_ind==1 & count_repair==0 & count_accident==0
replace have_accident_repair=0 if total_repairs>0 & repair_ind==.
// people who do not have repair records at all will have a missing value generated

forvalues i=1(1)10{
	forvalues j=1(1)`i'{
		
		display (`j')
		gen temp`j'=1 if drive_ind==1 & repair_ind[_n+`j']==1 & (count_repair[_n+`j']>0 | count_accident[_n+`j']>0) ///
		& vin==vin[_n+`j'] & calendar_day_num[_n+`j']<=calendar_day_num+`i'
				
	}
	egen temp_total`i'_repair=rowtotal(have_accident_repair-temp`i')
	replace temp_total`i'_repair=1 if temp_total`i'_repair>1
	replace temp_total`i'_repair=. if total_repairs==0
	drop temp1-temp`i'
}




** generate dummies for NM
gen hb1=total_prev_hard_brake
gen hb2=total_real_hard_brake_1
gen hb3=total_hard_brake

forvalues i=1(1)3{
	gen hb`i'_dummy=1 if hb`i'>0
	replace hb`i'_dummy=0 if hb`i'_dummy==.
}

forvalues i=4(1)6{
	loc j=`i'-3
	display `j'

	gen hb`i'_dummy=1 if hb`j'>0 & total_hard_turn>0
	replace hb`i'_dummy=0 if hb`i'_dummy==.
}



***********************************************
* Compare NM freq when accident=1 or accident=0
***********************************************



keep if drive_ind==1


forvalues j=1(1)6{
	qui estpost ttest hb`j'_dummy, by(have_accident)
	
	matrix A = (e(mu_1), e(mu_2), e(p_l))
	matrix list A
	estout matrix(A) using $results/compare_nm_real.csv, append 

	
	forvalues i=1(1)3{
		qui estpost ttest hb`j'_dummy, by(temp_total`i')

		matrix A = (e(mu_1), e(mu_2), e(p_l))
		matrix list A
		estout matrix(A) using $results/compare_nm_real.csv, append 

	}
	
}



forvalues j=1(1)6{
	qui estpost ttest hb`j'_dummy, by(have_accident_repair)
	
	matrix A = (e(mu_1), e(mu_2), e(p_l))
	matrix list A
	estout matrix(A) using $results/compare_nm_real.csv, append 
	
	forvalues i=1(1)3{
		qui estpost ttest hb`j'_dummy, by(temp_total`i'_repair)
		
		matrix A = (e(mu_1), e(mu_2), e(p_l))
		matrix list A
		estout matrix(A) using $results/compare_nm_real.csv, append 

	}
}


***************************************************
* Predict accident using risky behavior (bypass NM)
***************************************************

drop day_index

sort vin calendar_day_num
bysort vin: gen day_index=_n

xtset user_id day_index


gen rain_snow=1 if rain==1 | snow==1 | rain_storm==1
replace rain_snow=0 if rain_snow==.


logit have_accident total_phone_use distance speed drive_at_night highway ///
	total_traffic_jam weekend rush_hour high_temper low_temper sunny rain_snow l.drive_score gap_time, robust 	
outreg2 using $results/pred_accident_072020.xls, replace

